library(pacman)
p_load(gapminder)
p_load(tidyverse)
summary(gapminder)
country continent year lifeExp pop gdpPercap
Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60 Min. :6.001e+04 Min. : 241.2
Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20 1st Qu.:2.794e+06 1st Qu.: 1202.1
Algeria : 12 Asia :396 Median :1980 Median :60.71 Median :7.024e+06 Median : 3531.8
Angola : 12 Europe :360 Mean :1980 Mean :59.47 Mean :2.960e+07 Mean : 7215.3
Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85 3rd Qu.:1.959e+07 3rd Qu.: 9325.5
Australia : 12 Max. :2007 Max. :82.60 Max. :1.319e+09 Max. :113523.1
(Other) :1632
plot(lifeExp ~ year, gapminder)
plot(lifeExp ~ gdpPercap, gapminder)
plot(lifeExp ~ log(gdpPercap), gapminder)
table(gapminder$continent) # cout how many observation per continent
Africa Americas Asia Europe Oceania
624 300 396 360 24
barplot(table(gapminder$continent))
plot(lifeExp ~ year, gapminder, subset = country == "Zimbabwe")
plot(lifeExp ~ log(gdpPercap), gapminder, subset = year == 2007)
subset(gapminder, subset = country == "Cambodia")
subset(gapminder, subset = country %in% c("Japan", "Belgium"), select = c(country, year, lifeExp))
filter(gapminder, country == "Rwanda", year > 1979) # filter rows
gapminder %>% select(year, lifeExp) %>% head(4) # select columns
my_gap <- gapminder
my_gap %>% mutate(gdp_billion = pop * gdpPercap/1e+09,
popMil = round(pop/1e+06, 1),
total_years = pop * lifeExp) # create new columns
my_gap %>% arrange(year, country) # sort by year and coutry
my_gap %>% rename(life_exp = lifeExp, gdp_percap = gdpPercap) # rename fields
my_gap %>% group_by(continent) %>% summarize(n = n(), n_countries = n_distinct(country))
my_gap %>% group_by(continent) %>% summarize(avg_lifeExp = mean(lifeExp))
my_gap %>%
select(country, year, continent, lifeExp) %>%
group_by(continent, country) %>%
mutate(le_delta = lifeExp - lag(lifeExp)) %>%
summarize(worst_le_delta = min(le_delta, na.rm = TRUE)) %>%
top_n(-1, wt = worst_le_delta) %>%
arrange(worst_le_delta)
library(pacman)
p_load(car)
p_load(ggsci)
# random grades
set.seed(100)
MathGrade <- rnorm(n = 100, mean = 70, sd = 10)
set.seed(1000)
ReadingGrade <- rnorm(n = 100, mean = 65, sd = 13)
# where and how they took the test
TestLocation <- c(rep("Classroom", 50), rep("Home", 50))
TestFormat <- c(rep("Paper", 25), rep("Electronic", 25), rep("Paper", 25), rep("Electronic", 25))
students <- data.frame(MathGrade, ReadingGrade, TestLocation, TestFormat)
# devide different conditions
PaperTest <- students %>% dplyr::filter(TestFormat == "Paper")
ElectronicTest <- students %>% dplyr::filter(TestFormat == "Electronic")
Classroom <- students %>% dplyr::filter(TestLocation == "Classroom")
Home <- students %>% dplyr::filter(TestLocation == "Home")
# condition with 2 constraints
PaperTestHome <- students %>% dplyr::filter(TestFormat == "Paper", TestLocation == "Home")
PaperTestClassroom <- students %>% dplyr::filter(TestFormat == "Paper", TestLocation == "Classroom")
ElectronicTestHome <- students %>% dplyr::filter(TestFormat == "Electronic", TestLocation == "Home")
ElectronicTestClassroom <- students %>% dplyr::filter(TestFormat == "Electronic", TestLocation =="Classroom")
plot(students$MathGrade, students$ReadingGrade,
main = "Math grade vs. Reading grade",
sub = "All conditions",
xlab = "Math grade",
ylab = "Reading grade",
xlim = c(40, 100),
ylim = c(40, 100),
frame.plot = FALSE)
# car one is power up
car::scatterplot(ReadingGrade ~ MathGrade,
data = students,
smooth = list(degree = 2, style = "none"))
multiple plots with par
main_title <- "Math grade vs. Reading grade"
xlab <- "Math grade"
ylab <- "Reading grade"
op <- par(mfrow = c(2, 2))
#paper test
plot(PaperTest$MathGrade, PaperTest$ReadingGrade,
main = main_title,
sub = "Paper Test",
xlab = xlab, ylab = ylab,
xlim = c(0, 100), ylim = c(0, 100))
#electronic test
plot(ElectronicTest$MathGrade, ElectronicTest$ReadingGrade,
main = main_title,
sub = "Electronic Test",
xlab = xlab, ylab = ylab,
xlim = c(0,100), ylim = c(0, 100))
#classroom test
plot(Classroom$MathGrade, Classroom$ReadingGrade,
main = main_title,
sub = "Classroom",
xlab = xlab, ylab = ylab,
xlim = c(0,100), ylim = c(0, 100))
#classroom test
plot(Home$MathGrade, Home$ReadingGrade,
main = main_title,
sub = "Home",
xlab = xlab, ylab = ylab,
xlim = c(0,100), ylim = c(0, 100))
par(op) #reset the global paramters
plot(PaperTest$MathGrade, PaperTest$ReadingGrade,
main = main_title,
sub = "Paper Test",
xlab = xlab, ylab =
ylab, xlim = c(0,100), ylim = c(0, 100))
#add points to an existing plot
points(ElectronicTest$MathGrade, ElectronicTest$ReadingGrade,
main = main_title,
pch = 2,
col = "blue")
# add a legend
legend("topleft",
legend = c("Paper Test", "Electronic Test"),
col = c("Black", "Blue"),
pch = c(1, 2))
my_cols <- c("#00AFBB", "#E7B800", "#FC4E07")
X <- iris %>% dplyr::select(-Species)
pairs(X, pch=19, lower.panel=NULL, cex=0.5, col=my_cols[iris$Species])
mm <- as_tibble(morley)
#make factors
mm$Expt <- factor(mm$Expt)
mm$Run <- factor(mm$Run)
plot(Speed ~ Expt, data = mm, main = "Speed of Light Data", xlab = "Experiment No.")
#without outliers
boxplot(Speed ~ Expt, data = mm, frame = FALSE, outline = FALSE, main = "Michelson Speed of light data", xlab = "Experiment")
stripchart(Speed ~ Expt, data = mm,
pch = 1:5, col = 1:5,
vertical = TRUE,
method = "jitter",
main = "Speed by Experiment", xlab = "Experiment")
# consider only the 1st three rows, to simplify
x <- VADeaths[1:3, "Rural Male"]
# basic bar plot
barplot(x,
col = c("#999999", "#E69F00", "#56B4E9"),
main = "Death rates in Virginia",
xlab = "Age group",
ylab = "Rate",
horiz = TRUE)
stacked bar plots
# colors
palette <- ggsci::pal_startrek()
my_cols <- palette(5)
op <- par(mfrow = c(1, 2))
barplot(VADeaths, col = my_cols)
legend("topleft", legend = rownames(VADeaths), fill = my_cols, box.lty = 0, cex = 0.8)
barplot(VADeaths, col = my_cols, beside = TRUE)
legend("topleft", legend = rownames(VADeaths), fill = my_cols, box.lty = 0, cex = 0.8)
# data generation
x <- seq(1, 10)
y1 <- x * x
y2 <- 2 * y1
op <- par(mfrow = c(1, 2))
# stair steps plot
plot(x, y1, type = "S", xlab = "x", ylab = "y")
# con le palline
plot(x, y1, type = "b", pch = 19, col = "darkorange", xlab = "x", ylab = "y")
lines(x, y2, pch = 18, type = "b", col = "darkred", lty = 2)
legend("topleft", legend = c("x^2", "2x^2"), col = c("blue",
"darkred"), lty = 1:2, lwd = 2, cex = 0.8)
x <- students$MathGrade
hist(x, col = "steelblue", breaks = 20)
dens <- density(x)
plot(dens, col = "blue", main = "Density of Math grades") # a filled version using polygon():
polygon(dens, col = "blue")
data(faithful)
x <- as_tibble(faithful)
lm_fit <- lm(eruptions ~ waiting, data = x)
summary(lm_fit)
qqnorm(resid(lm_fit), main = "Residuals rankit plot")
qqline(resid(lm_fit))
as_tibble(mtcars)
x <- mtcars %>% dplyr::arrange(mpg)
# group by 'cyl' and color groups
grps <- as.factor(x$cyl)
# select the required number of colors from a custom
# palette
my_cols <- (ggsci::pal_futurama())(nlevels(grps))
dotchart(x$mpg,
labels = rownames(x),
groups = grps,
gcolor = my_cols,
color = my_cols[grps],
cex = 0.6,
pch = 19,
xlab = "mpg")
library(pacman)
p_load(tidyverse)
options(scipen = 999) # turn off scientific notation
data("midwest", package = "ggplot2")
then add pounts aes() is used to tell the graph which part of the dataset we are interested in
par(mfrow = c(2, 2))
g<-ggplot(midwest, aes(x = area, y = poptotal)) +
geom_point(aes(col=state), size=3)+# add points with a different color for each state
geom_smooth(method = "lm") # add an interpolation line
we have 2 option here, the first zooms and consider for regression only the point displayed while the seocnd one only zooms but remember of the outliers
gx <- g + xlim(c(0,0.1)) + ylim(c(0,1000000)) # deletes all the points outiside limits
g2 <- g + coord_cartesian(xlim = c(0,0.1) , ylim= c(0,1000000) )# only zooms in
g3 <- g2 + labs(title = " Area vs Population",
subtitle = "From midwest dataset",
y= "population",
x= "Area",
caption = "midwest demographic")
g4 <- g3 + scale_color_brewer(palette = "Set3")
scale_x_continous is for changing the ticks and the text in the axis even in a complex way using functions
g5 <- g4 + scale_x_continuous(breaks = seq(0, 0.1, 0.01), labels = sprintf("%1.2f%%", seq(0, 0.1, 0.01))) +
scale_y_continuous(breaks = seq(0,1000000, 200000), labels = function(x){paste0(x/1000, 'K')})
g5
`geom_smooth()` using formula 'y ~ x'
gg <- g+scale_x_continuous(breaks = seq(0, 0.1, 0.01))
gg + theme_bw() + labs(subtitle = "BW Theme")
`geom_smooth()` using formula 'y ~ x'
gg + theme_classic()+ labs(subtitle = "classic")
`geom_smooth()` using formula 'y ~ x'
gg<- ggplot(midwest, aes(x = area, y = poptotal)) + # canvas
geom_point(aes(col = state, size = popdensity))+ # pointswith different color and size
geom_smooth(method = "loess", se= F)+ # line
xlim(c(0, 0.1)) + ylim(c(0,500000))+ # zoom
labs(title = "Area Vs Population", y= "Population", x = "Area", caption = "midwest")
plot(gg)
g4 + theme(plot.title=element_text(size=20, face="bold", family="Roboto", color="tomato", hjust=0.5, lineheight=1.2), # title
plot.subtitle=element_text(size=15, family="Roboto",face="bold", hjust=0.5), #
plot.caption=element_text(size=15), # caption
axis.title.x=element_text(vjust=0, size=15), # X axis title
axis.title.y=element_text(size=15), # Y axis title
axis.text.x=element_text(size=10, angle = 30, vjust=.5), # X axis text
axis.text.y=element_text(size=10)) # Y axis text
gg + labs(color = "State", size = "Density")
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 15 rows containing non-finite values (stat_smooth).
Warning: Removed 15 rows containing missing values (geom_point).
gg + scale_color_discrete(name = "State") + scale_size_continuous(name = "Density", guide = F) # giude F hide the legend
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 15 rows containing non-finite values (stat_smooth).
Warning: Removed 15 rows containing missing values (geom_point).
Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
# manually seleect the colours
gg + scale_color_manual(name = "State",
labels = c("Illinois", "Indiana", "Michigan", "Ohio", "winsconsin"),
values = c(IL = "blue",IN = "red", MI = "green", OH = "brown", WI = "orange"))
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 15 rows containing non-finite values (stat_smooth).
Warning: Removed 15 rows containing missing values (geom_point).
# change the order of the legends
gg + guides(colour = guide_legend(order = 1), size = guide_legend(order = 2))
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 15 rows containing non-finite values (stat_smooth).
Warning: Removed 15 rows containing missing values (geom_point).
midwest_sub <- midwest %>% dplyr::filter(poptotal > 300000) # take only big counties
midwest_sub$large_county <- ifelse(midwest_sub$poptotal > 300000, midwest_sub$county, "") # create a new field if large
gg + geom_text(aes(label = large_county), size=2, data= midwest_sub) + # add text only to them
theme(legend.position = "none")
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 15 rows containing non-finite values (stat_smooth).
Warning: Removed 15 rows containing missing values (geom_point).
Warning: Removed 14 rows containing missing values (geom_text).
p_load(ggrepel)
gg + geom_label_repel(aes(label = large_county), size =2, data = midwest_sub) +
theme(legend.position = "none")
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 15 rows containing non-finite values (stat_smooth).
Warning: Removed 15 rows containing missing values (geom_point).
Warning: Removed 14 rows containing missing values (geom_label_repel).
gg + coord_flip()
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 15 rows containing non-finite values (stat_smooth).
Warning: Removed 15 rows containing missing values (geom_point).
gg + scale_x_reverse() + scale_y_reverse()
Scale for 'x' is already present. Adding another scale for 'x', which will replace the existing scale.
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
`geom_smooth()` using formula 'y ~ x'
data(mpg, package = "ggplot2")
basic plot
g <- ggplot(mpg, aes(x= displ, y = hwy)) + geom_point() + labs(title = "hwy vs displ") +
geom_smooth(method = "lm", se = F) + theme_bw()
plot(g)
`geom_smooth()` using formula 'y ~ x'
oi can break this into small plot
g + facet_wrap(~class, nrow = 3) + labs(title = "hwy vs displ")
`geom_smooth()` using formula 'y ~ x'
g + facet_wrap(~class, scales = "free") + labs(title = "hwy vs displ")
`geom_smooth()` using formula 'y ~ x'
g + facet_grid(manufacturer ~ class)
`geom_smooth()` using formula 'y ~ x'
p_load(tidyverse)
there are 8 categories of plotsthat cover the biggest part of them
study how correlated two variables are, usually we use a scatter plot, the geom smooth draws smooting line
theme_set(theme_bw()) # global preset, bw theme
data("midwest", package = "ggplot2")
# midwest <- read.csv('http://goo.gl/G1K41K') # bkup data
# source
# Scatterplot
gg <- ggplot(midwest, aes(x = area, y = poptotal)) + geom_point(aes(col = state,
size = popdensity)) + geom_smooth(method = "loess", se = F) +
xlim(c(0, 0.1)) + ylim(c(0, 5e+05)) + labs(subtitle = "Area Vs Population",
y = "Population", x = "Area", title = "Scatterplot", caption = "Source: midwest")
plot(gg)
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 15 rows containing non-finite values (stat_smooth).
Warning: Removed 15 rows containing missing values (geom_point).
do a circle around some points you want to highlight
p_load(ggalt)
also installing the dependencies ‘later’, ‘extrafontdb’, ‘Rttf2pt1’, ‘htmlwidgets’, ‘lazyeval’, ‘crosstalk’, ‘promises’, ‘proj4’, ‘ash’, ‘maps’, ‘extrafont’, ‘plotly’
trying URL 'https://cran.rstudio.com/bin/macosx/contrib/4.1/later_1.3.0.tgz'
Content type 'application/x-gzip' length 623693 bytes (609 KB)
==================================================
downloaded 609 KB
trying URL 'https://cran.rstudio.com/bin/macosx/contrib/4.1/extrafontdb_1.0.tgz'
Content type 'application/x-gzip' length 6792 bytes
==================================================
downloaded 6792 bytes
trying URL 'https://cran.rstudio.com/bin/macosx/contrib/4.1/Rttf2pt1_1.3.10.tgz'
Content type 'application/x-gzip' length 105843 bytes (103 KB)
==================================================
downloaded 103 KB
trying URL 'https://cran.rstudio.com/bin/macosx/contrib/4.1/htmlwidgets_1.5.4.tgz'
Content type 'application/x-gzip' length 894885 bytes (873 KB)
==================================================
downloaded 873 KB
trying URL 'https://cran.rstudio.com/bin/macosx/contrib/4.1/lazyeval_0.2.2.tgz'
Content type 'application/x-gzip' length 156515 bytes (152 KB)
==================================================
downloaded 152 KB
trying URL 'https://cran.rstudio.com/bin/macosx/contrib/4.1/crosstalk_1.2.0.tgz'
Content type 'application/x-gzip' length 406034 bytes (396 KB)
==================================================
downloaded 396 KB
trying URL 'https://cran.rstudio.com/bin/macosx/contrib/4.1/promises_1.2.0.1.tgz'
Content type 'application/x-gzip' length 1782018 bytes (1.7 MB)
==================================================
downloaded 1.7 MB
trying URL 'https://cran.rstudio.com/bin/macosx/contrib/4.1/proj4_1.0-11.tgz'
Content type 'application/x-gzip' length 17103507 bytes (16.3 MB)
==================================================
downloaded 16.3 MB
trying URL 'https://cran.rstudio.com/bin/macosx/contrib/4.1/ash_1.0-15.tgz'
Content type 'application/x-gzip' length 29954 bytes (29 KB)
==================================================
downloaded 29 KB
trying URL 'https://cran.rstudio.com/bin/macosx/contrib/4.1/maps_3.4.0.tgz'
Content type 'application/x-gzip' length 3106040 bytes (3.0 MB)
==================================================
downloaded 3.0 MB
trying URL 'https://cran.rstudio.com/bin/macosx/contrib/4.1/extrafont_0.18.tgz'
Content type 'application/x-gzip' length 54341 bytes (53 KB)
==================================================
downloaded 53 KB
trying URL 'https://cran.rstudio.com/bin/macosx/contrib/4.1/plotly_4.10.0.tgz'
Content type 'application/x-gzip' length 3115123 bytes (3.0 MB)
==================================================
downloaded 3.0 MB
trying URL 'https://cran.rstudio.com/bin/macosx/contrib/4.1/ggalt_0.4.0.tgz'
Content type 'application/x-gzip' length 2361598 bytes (2.3 MB)
==================================================
downloaded 2.3 MB
The downloaded binary packages are in
/var/folders/g9/qnw0ry294vs3gs4801w53bb80000gn/T//RtmpslQMjZ/downloaded_packages
ggalt installed
midwest_select <- midwest %>% dplyr::filter(poptotal > 350000,
poptotal <= 500000,
area > 0.01,
area < 0.1)
# Plot
ggplot(midwest, aes(x=area, y=poptotal)) +
geom_point(aes(col=state, size=popdensity)) + # draw points
geom_smooth(method="loess", se=FALSE) + # draw smoothing line
xlim(c(0, 0.1)) +
ylim(c(0, 500000)) +
geom_encircle(aes(x=area, y=poptotal),
data=midwest_select, # filtered dataframe
color="red",
size=2,
expand=0.08) + # expand the curve a little bit outside the points
labs(subtitle="Area Vs Population",
y="Population",
x="Area",
title="Scatterplot + Encircle",
caption="Source: midwest")
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 15 rows containing non-finite values (stat_smooth).
Warning: Removed 15 rows containing missing values (geom_point).
whrn the data is integers we may have many overlapping poits, using jitter we can add some random noise to see all the points
data(mpg, package = "ggplot2") # alternate source: 'http://goo.gl/uEeRGu')
theme_set(theme_bw())
g <- ggplot(mpg, aes(cty, hwy))
g +geom_jitter(width = 0.5, size = 1) +
geom_smooth(method = "lm",se = FALSE) +
labs(subtitle = "mpg: city vs highway mileage", y = "hwy", x = "cty", title = "Jittered Points")
`geom_smooth()` using formula 'y ~ x'
instead of adding noise we can do a bigger point when ther is overlapping
g + geom_count(col = "tomato3", show.legend = TRUE) +
labs(subtitle = "mpg: city vs highway mileage", y = "hwy", x = "cty", title = "Counts Plot")
scatter is for comparing the relationship between two continuos variables while a bubble if you want the relationship whithin the group based on : 1) a categorical value (color) 2) a contonuos variable ( size)
mpg_select <- mpg %>%
dplyr::filter(manufacturer %in% c("audi", "ford", "honda",
"hyundai"))
g <- ggplot(mpg_select, aes(displ, cty)) + labs(subtitle = "mpg: City Mileage vs. Displacement",
title = "Bubble chart")
g + geom_jitter(aes(col = manufacturer, size = hwy)) + geom_smooth(aes(col = manufacturer),
method = "lm", se = F)
`geom_smooth()` using formula 'y ~ x'
relationship and distribution in the same graph
p_load(ggExtra)
g <- ggplot(mpg, aes(cty, hwy)) +
geom_count(show.legend = FALSE) + # size
geom_smooth(method = "lm", se = F) # line
ggMarginal(g, type = "histogram", fill = "transparent") # add marginal distribution
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
ggMarginal(g, type = "boxplot", fill = "transparent")
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
ggMarginal(g, type = "density", fill = "transparent")
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
ggMarginal(g, type = "densigram") # density + histogram
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
Correlograms let you examine the correlation of multiple continuous variables present in the same dataframe
p_load(ggcorrplot)
data(mtcars)
dim(mtcars)
[1] 32 11
#> [1] 32 11
# compute the correlation matrix
corr <- round(cor(mtcars), 1)
# plot
ggcorrplot(corr,
hc.order = F, # order the corr. matrix by hierarchical clustering
type = "lower",
lab = TRUE, # add corr. coefficients
lab_size = 3,
method="circle",
colors = c("tomato2", "white", "springgreen3"), # colors for low, mid, high correlation values
title="Correlogram of mtcars",
ggtheme=theme_bw)
Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
compare variation in values between small number of items
data("mtcars")
# data prep
mtcars <- tibble::rownames_to_column(mtcars, var="car name") %>% # create new column for car names
mutate(mpg_z=round(scale(mpg), 2), # compute normalized mpg
mpg_type=ifelse(mpg_z < 0, "below", "above"),) %>% # above / below avg flag
arrange(mpg_z)# sort
mtcars$`car name` <- factor(mtcars$`car name`, levels = mtcars$`car name`) # convert to factor to retain sorted order in plot.
# diverging bars
ggplot(mtcars, aes(x=`car name`, y=mpg_z, label=mpg_z)) +
geom_bar(stat="identity", aes(fill=mpg_type), width=.5) +
scale_fill_manual(name="Mileage",
labels = c("Above Average", "Below Average"),
values = c("above"="#00ba38", "below"="#f8766d")) +
labs(subtitle="Normalized mileage from mtcars",
title= "Diverging Bars") +
coord_flip() +
theme_bw()
ggplot(mtcars, aes(x = `car name`, y = mpg_z, label = mpg_z)) +
geom_point(stat = "identity", fill = "black", size = 6) +
geom_segment(aes(y = 0, x = `car name`, yend = mpg_z, xend = `car name`),color = "blue") +
geom_text(color = "white", size = 2) +
labs(title = "Diverging Lollipop Chart", subtitle = "Normalized mileage from mtcars: Lollipop") +
ylim(-2.5, 2.5) + coord_flip() + theme_bw()
ggplot(mtcars, aes(x = `car name`, y = mpg_z, label = mpg_z)) +
geom_point(stat = "identity", aes(col = mpg_type), size = 6) +
scale_color_manual(name = "Mileage", labels = c("Above Average", "Below Average"), values = c(above = "#00ba38", below = "#f8766d")) +
geom_text(color = "white", size = 2) +
labs(title = "Diverging Dot Plot", subtitle = "Normalized mileage from 'mtcars': Dotplot") +
ylim(-2.5, 2.5) + coord_flip() + theme_bw()
data("economics", package = "ggplot2")
# Compute %Returns
economics$returns_perc <- c(0, diff(economics$psavert)/economics$psavert[-length(economics$psavert)])
# Create break points and labels for axis ticks
brks <- economics$date[seq(1, length(economics$date), 12)]
lbls <- lubridate::year(brks)
# plot the 1st 100 observations
ggplot(economics[1:100, ], aes(date, returns_perc)) + geom_area() +
scale_x_date(breaks = brks, labels = lbls) + labs(title = "Area Chart",
subtitle = "Percentage Returns for Personal Savings", y = "% Returns for Personal savings",
caption = "Source: economics dataset") + theme_bw() + theme(axis.text.x = element_text(angle = 90))
A ranking plot is used to compare the position or performance of multiple items with respect to each other. Actual values matter somewhat less than the ranking.
# data prep: group mean city mileage by manufacturer.
cty_mpg <- mpg %>% group_by(make = manufacturer) %>% summarise(mileage = mean(cty))
cty_mpg <- arrange(cty_mpg, mileage) # sort
cty_mpg$make <- factor(cty_mpg$make, levels = cty_mpg$make) # refactor to retain the order in plot.
# Draw plot
ggplot(cty_mpg, aes(x = make, y = mileage)) + geom_bar(stat = "identity",
width = 0.5, fill = "tomato3") + labs(title = "Ordered Bar Chart",
subtitle = "Make Vs Avg. Mileage", caption = "source: mpg") +
theme_bw() + theme(axis.text.x = element_text(angle = 65,
vjust = 0.6))
# Draw plot
ggplot(cty_mpg, aes(x = make, y = mileage)) + geom_point(size = 3) +
geom_segment(aes(x = make, xend = make, y = 0, yend = mileage)) +
labs(title = "Lollipop Chart", subtitle = "Make Vs Avg. Mileage",
caption = "source: mpg") + theme_bw() + theme(axis.text.x = element_text(angle = 65,
vjust = 0.6))
ggplot(cty_mpg, aes(x=make, y=mileage)) +
geom_point(col="tomato2", size=3) + # draw points
geom_segment(aes(x=make,
xend=make,
y=min(mileage),
yend=max(mileage)),
linetype="dashed", # draw dashed lines
size=0.1) +
labs(title="Dot Plot",
subtitle="Make Vs Avg. Mileage",
caption="source: mpg") +
coord_flip() +
theme_classic()
library(scales)
Attaching package: ‘scales’
The following object is masked from ‘package:purrr’:
discard
The following object is masked from ‘package:readr’:
col_factor
# data prep
dataf <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/gdppercap.csv")
colnames(dataf) <- c("continent", "1952", "1957")
# prepare labels
left_label <- paste(dataf$continent, round(dataf$`1952`), sep=", ")
right_label <- paste(dataf$continent, round(dataf$`1957`), sep=", ")
dataf <- dataf %>% mutate(class=ifelse(`1957` - `1952` < 0, "red", "green"))
p <- ggplot(dataf) + geom_segment(aes(x=1, xend=2, y=`1952`, yend=`1957`, col=class), size=.75, show.legend=F) +
geom_vline(xintercept=1, linetype="dashed", size=.1) +
geom_vline(xintercept=2, linetype="dashed", size=.1) +
scale_color_manual(labels = c("Up", "Down"),
values = c("green"="#00ba38", "red"="#f8766d")) + # color of lines
labs(x="", y="Mean GdpPerCap") + # Axis labels
xlim(.5, 2.5) + ylim(0,(1.1*(max(dataf$`1952`, dataf$`1957`)))) +
theme_classic()
# add texts
p <- p + geom_text(label=left_label, y=dataf$`1952`, x=rep(1, NROW(dataf)), hjust=1.1, size=3.5)
p <- p + geom_text(label=right_label, y=dataf$`1957`, x=rep(2, NROW(dataf)), hjust=-0.1, size=3.5)
p <- p + geom_text(label="Time 1", x=1, y=1.1*(max(dataf$`1952`, dataf$`1957`)), hjust=1.2, size=5) # title
p <- p + geom_text(label="Time 2", x=2, y=1.1*(max(dataf$`1952`, dataf$`1957`)), hjust=-0.1, size=5) # title
# Minify theme
p + theme(panel.background = element_blank(),
panel.grid = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_blank(),
panel.border = element_blank(),
plot.margin = unit(c(1,2,1,2), "cm"))
library(ggalt)
health <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/health.csv")
health$Area <- factor(health$Area, levels = as.character(health$Area)) # for the correct ordering of the dumbbells
ggplot(health, aes(x = pct_2014, xend = pct_2013, y = Area, group = Area)) +
geom_dumbbell(color = "#a3c4dc", size = 0.75, colour_xend = "#0e668b") +
scale_x_continuous(label = scales::percent) + labs(x = NULL,
y = NULL, title = "Dumbbell Chart", subtitle = "Pct Change: 2013 vs 2014",
caption = "Source: https://github.com/hrbrmstr/ggalt") +
theme_classic() + theme(plot.title = element_text(hjust = 0.5,
face = "bold"), plot.background = element_rect(fill = "#f7f7f7"),
panel.background = element_rect(fill = "#f7f7f7"), panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(), panel.grid.major.x = element_line(),
axis.ticks = element_blank(), legend.position = "top", panel.border = element_blank())
theme_set(theme_classic()) # set the theme beforehand
# histogram on a continuous (numeric) variable
g <- ggplot(mpg, aes(displ)) + scale_fill_brewer(palette = "Spectral")
g + geom_histogram(aes(fill=class),
binwidth = .1, # change binwidth
col="black",
size=.1) +
labs(title="Histogram with Auto Binning",
subtitle="Engine Displacement across Vehicle Classes")
g + geom_histogram(aes(fill=class),
bins=5, # change number of bins
col="black",
size=.1) +
labs(title="Histogram with Fixed Bins",
subtitle="Engine Displacement across Vehicle Classes")
theme_set(theme_classic())
# Histogram on a Categorical variable
g <- ggplot(mpg, aes(manufacturer))
g + geom_bar(aes(fill = class), width = 0.5) + theme(axis.text.x = element_text(angle = 65,
vjust = 0.6)) + labs(title = "Histogram on Categorical Variable",
subtitle = "Manufacturer across Vehicle Classes")
theme_set(theme_classic())
g <- ggplot(mpg, aes(cty))
g + geom_density(aes(fill = factor(cyl)), alpha = 0.8) + labs(title = "Density plot",
subtitle = "City Mileage Grouped by Number of cylinders",
caption = "Source: mpg", x = "City Mileage", fill = "# Cylinders")
theme_set(theme_classic())
g <- ggplot(mpg, aes(class, cty))
g + geom_boxplot(varwidth = TRUE, fill = "plum") + labs(title = "Box plot",
subtitle = "City Mileage grouped by Class of vehicle", caption = "Source: mpg",
x = "Class of Vehicle", y = "City Mileage")
g + geom_boxplot(aes(fill = factor(cyl))) + theme(axis.text.x = element_text(angle = 65,
vjust = 0.6)) + labs(title = "Box plot", subtitle = "City Mileage grouped by Class of vehicle",
caption = "Source: mpg", x = "Class of Vehicle", y = "City Mileage")
theme_set(theme_bw())
g <- ggplot(mpg, aes(manufacturer, cty))
g + geom_boxplot() + geom_dotplot(binaxis = "y", stackdir = "center",
dotsize = 0.5, fill = "red") + theme(axis.text.x = element_text(angle = 65,
vjust = 0.6)) + labs(title = "Box plot + Dot plot", subtitle = "City Mileage vs Class: Each dot represents 1 row in source data",
caption = "Source: mpg", x = "Class of Vehicle", y = "City Mileage")
Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
g + geom_boxplot(outlier.color = NA) + geom_point(position = position_jitter(width = 0.2),
size = 1, color = "red") + theme(axis.text.x = element_text(angle = 65,
vjust = 0.6)) + labs(title = "Box plot + Dot plot", subtitle = "City Mileage vs Class: Each dot represents 1 row in source data",
caption = "Source: mpg", x = "Class of Vehicle", y = "City Mileage")
p_load(ggthemes)
trying URL 'https://cran.rstudio.com/bin/macosx/contrib/4.1/ggthemes_4.2.4.tgz'
Content type 'application/x-gzip' length 436158 bytes (425 KB)
==================================================
downloaded 425 KB
The downloaded binary packages are in
/var/folders/g9/qnw0ry294vs3gs4801w53bb80000gn/T//RtmpslQMjZ/downloaded_packages
ggthemes installed
theme_set(theme_tufte())
g <- ggplot(mpg, aes(manufacturer, cty))
g + geom_tufteboxplot() + theme(axis.text.x = element_text(angle = 65,
vjust = 0.6)) + labs(title = "Tufte Styled Boxplot", subtitle = "City Mileage grouped by Class of vehicle",
caption = "Source: mpg", x = "Class of Vehicle", y = "City Mileage")
theme_set(theme_bw())
g <- ggplot(mpg, aes(class, cty))
g + geom_violin() + labs(title = "Violin plot", subtitle = "City Mileage vs Class of vehicle",
caption = "Source: mpg", x = "Class of Vehicle", y = "City Mileage")
options(scipen = 999) # turns of scientific notations like 1e+40
# get data
email_campaign_funnel <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/email_campaign_funnel.csv")
# X axis breaks
brks <- seq(-15000000, 15000000, 5000000)
# X axis labels
lbls <- paste0(as.character(c(seq(15, 0, -5), seq(5, 15, 5))), "m")
# pyramid
ggplot(email_campaign_funnel, aes(x = Stage, y = Users, fill = Gender)) + # Fill column
geom_bar(stat = "identity", width = .6) + # draw the bars
scale_y_continuous(breaks = brks, # Breaks
labels = lbls) + # Labels
coord_flip() + # Flip axes
labs(title="Email Campaign Funnel") +
theme_tufte() + # Tufte theme from ggthemes
theme(plot.title = element_text(hjust = .5), # Center plot title
axis.ticks = element_blank()) +
scale_fill_brewer(palette = "Dark2") # Color palette
var <- mpg$class # categorical data
table(var) # original category distribution
var
2seater compact midsize minivan pickup subcompact suv
5 47 41 11 33 35 62
#> var
#> 2seater compact midsize minivan pickup subcompact suv
#> 5 47 41 11 33 35 62
# data prep
nrows <- 10 # our waffle chart will be a 10x10 square
dataf <- expand.grid(y = 1:nrows, x = 1:nrows)
categ_table <- round(table(var) * ((nrows * nrows)/(length(var)))) # transform the category distribution so that the counts sum up to 100
categ_table
var
2seater compact midsize minivan pickup subcompact suv
2 20 18 5 14 15 26
#> var
#> 2seater compact midsize minivan pickup subcompact suv
#> 2 20 18 5 14 15 26
# > 2seater compact midsize minivan pickup subcompact suv >
# 2 20 18 5 14 15 26
sum(categ_table)
[1] 100
#> [1] 100
dataf$category <- factor(rep(names(categ_table), categ_table))
# NOTE: if sum(categ_table) is not 100 (i.e. nrows^2), it
# will need adjustment to make the sum to 100.
# waffle chart
ggplot(dataf, aes(x = x, y = y, fill = category)) + geom_tile(color = "black",
size = 0.5) + scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0,
0), trans = "reverse") + scale_fill_brewer(palette = "Set3") +
labs(title = "Waffle Chart", subtitle = "'Class' of vehicles",
caption = "Source: mpg") + theme(panel.border = element_rect(size = 2),
plot.title = element_text(size = rel(1.2)), axis.text = element_blank(),
axis.title = element_blank(), axis.ticks = element_blank(),
legend.title = element_blank(), legend.position = "right")
theme_set(theme_classic())
# Source: Frequency table
dataf <- as.data.frame(table(mpg$class))
colnames(dataf) <- c("class", "freq")
pie <- ggplot(dataf, aes(x = "", y = freq, fill = factor(class))) +
geom_bar(width = 1, stat = "identity") + theme(axis.line = element_blank(),
plot.title = element_text(hjust = 0.5)) + labs(fill = "class",
x = NULL, y = NULL, title = "Pie Chart of class", caption = "Source: mpg")
# what we got so far
pie + coord_polar(theta = "y", start = 0) + theme(axis.ticks = element_blank(),
axis.text = element_blank(), axis.title = element_blank(),
panel.grid = element_blank())
p_load(treemapify)
ggplot(G20, aes(area = gdp_mil_usd, fill = hdi, label = country, subgroup = region)) +
geom_treemap() + geom_treemap_subgroup_border() +
geom_treemap_subgroup_text(place = "centre", grow = T, alpha = 0.5,
colour = "black", fontface = "italic", min.size = 0) +
geom_treemap_text(colour = "white", place = "topleft", reflow = T)
ggplot(G20, aes(area = 1, label = country, subgroup = hemisphere,
subgroup2 = region, subgroup3 = econ_classification)) + geom_treemap() +
geom_treemap_subgroup3_border(colour = "blue", size = 1) +
geom_treemap_subgroup2_border(colour = "white", size = 3) +
geom_treemap_subgroup_border(colour = "red", size = 5) +
geom_treemap_subgroup_text(place = "middle", colour = "red",
alpha = 0.5, grow = T) + geom_treemap_subgroup2_text(colour = "white",
alpha = 0.5, fontface = "italic") + geom_treemap_subgroup3_text(place = "top",
colour = "blue", alpha = 0.5) + geom_treemap_text(colour = "white",
place = "middle", reflow = T)
# data prep: frequency table
freqtable <- table(mpg$manufacturer)
dataf <- as.data.frame.table(freqtable) %>%
rename(manufacturer = Var1)
theme_set(theme_classic())
g <- ggplot(dataf, aes(manufacturer, Freq))
g + geom_bar(stat = "identity", width = 0.5, fill = "tomato2") +
labs(title = "Bar Chart", subtitle = "Manufacturer of vehicles",
caption = "Source: Frequency of Manufacturers from 'mpg' dataset") +
theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
g <- ggplot(mpg, aes(manufacturer))
g + geom_bar(aes(fill=class), width = 0.5) + # fill by class
theme(axis.text.x = element_text(angle=65, vjust=0.6)) +
labs(title="Categorywise Bar Chart",
subtitle="Manufacturer of vehicles",
caption="Source: Manufacturers from 'mpg' dataset")
p_load(ggfortify)
p_load(tidyverse)
p_load(zoo)
trying URL 'https://cran.rstudio.com/bin/macosx/contrib/4.1/zoo_1.8-10.tgz'
Content type 'application/x-gzip' length 1044220 bytes (1019 KB)
==================================================
downloaded 1019 KB
The downloaded binary packages are in
/var/folders/g9/qnw0ry294vs3gs4801w53bb80000gn/T//RtmpslQMjZ/downloaded_packages
zoo installed
# load data
data("AirPassengers")
# check they are a ts object
class(AirPassengers)
[1] "ts"
theme_set(theme_classic())
autoplot(AirPassengers) + labs(title = "AirPassengers") + theme(plot.title = element_text(hjust = 0.5))
data(economics)
# (re)compute %Returns
economics$returns_perc <- c(0, diff(economics$psavert)/economics$psavert[-length(economics$psavert)])
theme_set(theme_classic())
# Allow Default X Axis Labels
ggplot(economics, aes(x = date)) + geom_line(aes(y = returns_perc)) +
labs(title = "Time Series Chart", subtitle = "Returns Percentage from 'Economics' Dataset",
caption = "Source: Economics", y = "Returns %")
library(lubridate)
Attaching package: ‘lubridate’
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
theme_set(theme_bw())
# consider a 24-month timeframe
economics_m <- economics[1:24, ]
# labels and breaks for X axis text
lbls <- paste0(month.abb[month(economics_m$date)], " ", lubridate::year(economics_m$date)) # month.abb is a built-in constant
brks <- economics_m$date
head(brks)
[1] "1967-07-01" "1967-08-01" "1967-09-01" "1967-10-01" "1967-11-01" "1967-12-01"
#> [1] "1967-07-01" "1967-08-01" "1967-09-01" "1967-10-01" "1967-11-01"
#> [6] "1967-12-01"
head(lbls)
[1] "Jul 1967" "Aug 1967" "Sep 1967" "Oct 1967" "Nov 1967" "Dec 1967"
#> [1] "Jul 1967" "Aug 1967" "Sep 1967" "Oct 1967" "Nov 1967" "Dec 1967"
# plot
ggplot(economics_m, aes(x=date)) +
geom_line(aes(y=returns_perc)) +
labs(title="Monthly Time Series",
subtitle="Returns Percentage from Economics Dataset",
caption="Source: Economics",
y="Returns %") + # title and caption
scale_x_date(labels = lbls,
breaks = brks) + # change to monthly ticks and labels
theme(axis.text.x = element_text(angle = 90, vjust=0.5), # rotate x axis text
panel.grid.minor = element_blank())
theme_set(theme_bw())
# 7.5 years:
economics_y <- economics[1:90, ]
# labels and breaks for X axis text
brks <- economics_y$date[seq(1, length(economics_y$date), 12)] # one break at each year
lbls <- lubridate::year(brks)
# plot
ggplot(economics_y, aes(x=date)) +
geom_line(aes(y=returns_perc)) +
labs(title="Yearly Time Series",
subtitle="Returns Percentage from Economics Dataset",
caption="Source: Economics",
y="Returns %") + # title and caption
scale_x_date(labels = lbls,
breaks = brks) + # change to monthly ticks and labels
theme(axis.text.x = element_text(angle = 90, vjust=0.5), # rotate x axis text
panel.grid.minor = element_blank()) # turn off minor grid
data(economics_long, package = "ggplot2")
head(economics_long)
theme_set(theme_bw())
# filter & restrict to specific year range
dataf <- economics_long %>% dplyr::filter(variable %in% c("psavert", "uempmed"),
lubridate::year(date) %in% c(1967:1981))
table(dataf$variable)
psavert uempmed
174 174
#>
#> psavert uempmed
#> 174 174
# labels and breaks for X axis text
brks <- dataf$date[seq(1, length(dataf$date), 12)] # one break at each year
lbls <- lubridate::year(brks)
# plot
ggplot(dataf, aes(x=date)) +
geom_line(aes(y=value, col=variable)) +
labs(title="Time Series of Returns Percentage",
subtitle="Drawn from Long Data format",
caption="Source: Economics",
color=NULL) + # title and caption
scale_x_date(labels = lbls, breaks = brks) + # change to monthly ticks and labels
scale_color_manual(labels = c("psavert", "uempmed"),
values = c("psavert"="#00ba38", "uempmed"="#f8766d")) + # line color
theme(axis.text.x = element_text(angle = 90, vjust=0.5, size = 8), # rotate x axis text
panel.grid.minor = element_blank()) # turn off minor grid
theme_set(theme_bw())
dataf <- economics %>% dplyr::select(date, psavert, uempmed) %>%
dplyr::filter(lubridate::year(date) %in% c(1967:1981))
head(dataf)
# labels and breaks for X axis text
brks <- dataf$date[seq(1, length(dataf$date), 12)]
lbls <- lubridate::year(brks)
# plot
ggplot(dataf, aes(x=date)) +
geom_line(aes(y=psavert, col="psavert")) + # 1st line
geom_line(aes(y=uempmed, col="uempmed")) + # 2nd line
labs(title="Time Series of Returns Percentage",
subtitle="Drawn From Wide Data format",
caption="Source: Economics", y="value") + # title and caption
scale_x_date(labels = lbls, breaks = brks) + # change to monthly ticks and labels
scale_color_manual(name="",
values = c("psavert"="#00ba38", "uempmed"="#f8766d")) + # line color
theme(axis.text.x = element_text(angle = 90, vjust=0.5, size = 8),
panel.grid.minor = element_blank())
theme_set(theme_bw())
dataf <- economics %>% dplyr::select(date, psavert, uempmed) %>%
dplyr::filter(lubridate::year(date) %in% c(1967:1981))
# labels and breaks for X axis text
brks <- dataf$date[seq(1, length(dataf$date), 12)]
lbls <- lubridate::year(brks)
# plot
ggplot(dataf, aes(x=date)) +
geom_area(aes(y=psavert+uempmed, fill="psavert")) + # 1st "layer"
geom_area(aes(y=uempmed, fill="uempmed")) + # 2nd "layer" (plotted over the 1st)
labs(title="Area Chart of Returns Percentage",
subtitle="From Wide Data format",
caption="Source: Economics", y="value") + # title and caption
scale_x_date(labels = lbls, breaks = brks) + # change to monthly ticks and labels
scale_fill_manual(name="",
values = c("psavert"="#00ba38", "uempmed"="#f8766d")) + # line color
theme(panel.grid.minor = element_blank()) # turn off minor grid
library(plyr)
library(scales)
library(zoo)
dataf <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/yahoo.csv") # Yahoo! stock closing price 2007-2016
dataf$date <- as.Date(dataf$date) # format date
dataf <- dataf[dataf$year >= 2012, ] # filter years
# Create Month Week
dataf$yearmonth <- as.yearmon(dataf$date)
dataf$yearmonthf <- factor(dataf$yearmonth)
dataf <- ddply(dataf, .(yearmonthf), transform, monthweek = 1 +
week - min(week)) # compute week number of month
dataf <- dataf[, c("year", "yearmonthf", "monthf", "week", "monthweek",
"weekdayf", "VIX.Close")]
head(dataf)
ggplot(dataf, aes(monthweek, weekdayf, fill = VIX.Close)) + geom_tile(colour = "white") +
facet_grid(year ~ monthf) + scale_fill_gradient(low = "red",
high = "green") + labs(x = "Week of Month", y = "", title = "Time-Series Calendar Heatmap",
subtitle = "Yahoo Closing Price", fill = "Close")
theme_set(theme_classic())
source_df <- read.csv("https://raw.githubusercontent.com/jkeirstead/r-slopegraph/master/cancer_survival_rates.csv") # Estimates of % survival rates for different tumors
# Define functions. Source:
# https://github.com/jkeirstead/r-slopegraph Calculates
# slope graph positions based on Edward Tufte's layout
tufte_sort <- function(df, x = "year", y = "value", group = "group",
min.space = 0.05) {
## First rename the columns for consistency
ids <- match(c(x, y, group), names(df))
df <- df[, ids]
names(df) <- c("x", "y", "group")
## Expand grid to ensure every combination has a
## defined value
tmp <- expand.grid(x = unique(df$x), group = unique(df$group))
tmp <- merge(df, tmp, all.y = TRUE)
df <- mutate(tmp, y = ifelse(is.na(y), 0, y))
## Cast into a matrix shape and arrange by first column
require(reshape2)
tmp <- dcast(df, group ~ x, value.var = "y")
ord <- order(tmp[, 2])
tmp <- tmp[ord, ]
min.space <- min.space * diff(range(tmp[, -1]))
yshift <- numeric(nrow(tmp))
## Start at 'bottom' row Repeat for rest of the rows
## until you hit the top
for (i in 2:nrow(tmp)) {
## Shift subsequent row up by equal space so gap
## between two entries is >= minimum
mat <- as.matrix(tmp[(i - 1):i, -1])
d.min <- min(diff(mat))
yshift[i] <- ifelse(d.min < min.space, min.space - d.min,
0)
}
tmp <- cbind(tmp, yshift = cumsum(yshift))
scale <- 1
tmp <- melt(tmp, id = c("group", "yshift"), variable.name = "x",
value.name = "y")
## Store these gaps in a separate variable so that they
## can be scaled ypos = a*yshift + y
tmp <- transform(tmp, ypos = y + scale * yshift)
return(tmp)
}
plot_slopegraph <- function(df) {
ylabs <- subset(df, x == head(x, 1))$group
yvals <- subset(df, x == head(x, 1))$ypos
fontSize <- 3
gg <- ggplot(df, aes(x = x, y = ypos)) + geom_line(aes(group = group),
colour = "grey80") + geom_point(colour = "white", size = 8) +
geom_text(aes(label = y), size = fontSize, family = "American Typewriter") +
scale_y_continuous(name = "", breaks = yvals, labels = ylabs)
return(gg)
}
## Prepare data
dataf <- tufte_sort(source_df, x = "year", y = "value", group = "group",
min.space = 0.05)
Loading required package: reshape2
Attaching package: ‘reshape2’
The following object is masked from ‘package:tidyr’:
smiths
dataf <- transform(dataf, x = factor(x, levels = c(5, 10, 15,
20), labels = c("5 years", "10 years", "15 years", "20 years")),
y = round(y))
## Plot
plot_slopegraph(dataf) + labs(title = "Estimates of % survival rates") +
theme(axis.title = element_blank(), axis.ticks = element_blank(),
plot.title = element_text(hjust = 0.5, family = "American Typewriter",
face = "bold"), axis.text = element_text(family = "American Typewriter",
face = "bold"))
p_load(forecast)
theme_set(theme_classic())
# Subset data
nottem_small <- window(nottem, start = c(1920, 1), end = c(1925,
12)) # subset a smaller timewindow
# Plot
ggseasonplot(AirPassengers) + labs(title = "Seasonal plot: International Airline Passengers")
ggseasonplot(nottem_small) + labs(title = "Seasonal plot: Air temperatures at Nottingham Castle")
# install.packages(ggdendro)
p_load(ggdendro)
trying URL 'https://cran.rstudio.com/bin/macosx/contrib/4.1/ggdendro_0.1.23.tgz'
Content type 'application/x-gzip' length 173221 bytes (169 KB)
==================================================
downloaded 169 KB
The downloaded binary packages are in
/var/folders/g9/qnw0ry294vs3gs4801w53bb80000gn/T//RtmpslQMjZ/downloaded_packages
ggdendro installed
theme_set(theme_bw())
hc <- hclust(dist(USArrests), method = "average") # hierarchical clustering
ggdendrogram(hc, rotate = TRUE, size = 2)
# load/reload libraries as needed
p_load(ggalt)
p_load(ggfortify)
theme_set(theme_classic())
# we'll use the Iris dataset
# filter out the Species column
dataf <- iris %>% dplyr::select(-Species)
# compute the principal components
pca_mod <- prcomp(dataf)
# convert to dataframe & add back the Species column
df_pc <- data.frame(pca_mod$x, Species=iris$Species)
# create the subsetted dataframes to be encircled in the plot
df_pc_vir <- df_pc %>% dplyr::filter(Species == "virginica") # df for 'virginica'
df_pc_set <- df_pc %>% dplyr::filter(Species == "setosa") # df for 'setosa'
df_pc_ver <- df_pc %>% dplyr::filter(Species == "versicolor") # df for 'versicolor'
ggplot(df_pc, aes(PC1, PC2, col=Species)) + # base call
geom_point(aes(shape=Species), size=2) + # add points
labs(title="Iris Clusters",
subtitle="With principal components PC1 and PC2 as X and Y axis",
caption="Source: Iris") +
coord_cartesian(xlim = 1.2 * c(min(df_pc$PC1), max(df_pc$PC1)),
ylim = 1.2 * c(min(df_pc$PC2), max(df_pc$PC2))) + # change axis limits (without deleting points)
geom_encircle(data = df_pc_vir, aes(x=PC1, y=PC2)) + # draw circles
geom_encircle(data = df_pc_set, aes(x=PC1, y=PC2)) +
geom_encircle(data = df_pc_ver, aes(x=PC1, y=PC2))
library(plotly)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Attaching package: ‘plotly’
The following objects are masked from ‘package:plyr’:
arrange, mutate, rename, summarise
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
data <- data.frame(cond = rep(c("condition_1", "condition_2"),
each = 10), my_x = 1:100 + rnorm(100, sd = 9), my_y = 1:100 +
rnorm(100, sd = 16))
my_graph <- ggplot(data, aes(x = my_x, y = my_y)) + geom_point(shape = 1)
# Let's make it interactive using the ggplotly function !
p <- ggplotly(my_graph)
p
INCOMPLETE CHECK LESSON 6 IF YOU WANT
p_load(gapminder)
data <- gapminder
my_graph <- data %>%
ggplot(aes(x = gdpPercap, y = lifeExp, col = continent, size = pop)) +
geom_point(alpha = 0.8) + theme_minimal() + theme(legend.position = "bottom") +
guides(size = "none") + labs(x = "GDP per Capita", y = "Life Expectancy",
col = "")
p_load(gganimate)
p <- my_graph + transition_time(year)
animate(p, width = 700, height = 450, renderer = ffmpeg_renderer(format = "webm"))
Rendering [>--------------------------------------------------------------------------------------------------------------------------] at 2.6 fps ~ eta: 38s
Rendering [=>-------------------------------------------------------------------------------------------------------------------------] at 2.3 fps ~ eta: 43s
Rendering [===>-----------------------------------------------------------------------------------------------------------------------] at 2.1 fps ~ eta: 46s
Rendering [====>----------------------------------------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 44s
Rendering [=====>---------------------------------------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 43s
Rendering [======>--------------------------------------------------------------------------------------------------------------------] at 2.1 fps ~ eta: 44s
Rendering [========>------------------------------------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 43s
Rendering [=========>-----------------------------------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 41s
Rendering [==========>----------------------------------------------------------------------------------------------------------------] at 2.3 fps ~ eta: 40s
Rendering [===========>---------------------------------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 41s
Rendering [=============>-------------------------------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 40s
Rendering [==============>------------------------------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 39s
Rendering [===============>-----------------------------------------------------------------------------------------------------------] at 2.3 fps ~ eta: 38s
Rendering [================>----------------------------------------------------------------------------------------------------------] at 2.3 fps ~ eta: 38s
Rendering [=================>---------------------------------------------------------------------------------------------------------] at 2.3 fps ~ eta: 37s
Rendering [===================>-------------------------------------------------------------------------------------------------------] at 2.3 fps ~ eta: 37s
Rendering [====================>------------------------------------------------------------------------------------------------------] at 2.3 fps ~ eta: 37s
Rendering [=====================>-----------------------------------------------------------------------------------------------------] at 2.3 fps ~ eta: 36s
Rendering [======================>----------------------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 37s
Rendering [========================>--------------------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 36s
Rendering [=========================>-------------------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 36s
Rendering [==========================>------------------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 35s
Rendering [===========================>-----------------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 35s
Rendering [=============================>---------------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 35s
Rendering [==============================>--------------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 34s
Rendering [===============================>-------------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 33s
Rendering [================================>------------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 33s
Rendering [=================================>-----------------------------------------------------------------------------------------] at 2.3 fps ~ eta: 32s
Rendering [===================================>---------------------------------------------------------------------------------------] at 2.3 fps ~ eta: 31s
Rendering [====================================>--------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 32s
Rendering [=====================================>-------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 31s
Rendering [======================================>------------------------------------------------------------------------------------] at 2.2 fps ~ eta: 30s
Rendering [========================================>----------------------------------------------------------------------------------] at 2.2 fps ~ eta: 30s
Rendering [=========================================>---------------------------------------------------------------------------------] at 2.2 fps ~ eta: 29s
Rendering [==========================================>--------------------------------------------------------------------------------] at 2.2 fps ~ eta: 29s
Rendering [===========================================>-------------------------------------------------------------------------------] at 2.2 fps ~ eta: 29s
Rendering [=============================================>-----------------------------------------------------------------------------] at 2.2 fps ~ eta: 29s
Rendering [==============================================>----------------------------------------------------------------------------] at 2.2 fps ~ eta: 28s
Rendering [===============================================>---------------------------------------------------------------------------] at 2.2 fps ~ eta: 28s
Rendering [================================================>--------------------------------------------------------------------------] at 2.2 fps ~ eta: 27s
Rendering [=================================================>-------------------------------------------------------------------------] at 2.2 fps ~ eta: 27s
Rendering [===================================================>-----------------------------------------------------------------------] at 2.2 fps ~ eta: 26s
Rendering [====================================================>----------------------------------------------------------------------] at 2.2 fps ~ eta: 26s
Rendering [=====================================================>---------------------------------------------------------------------] at 2.2 fps ~ eta: 25s
Rendering [======================================================>--------------------------------------------------------------------] at 2.2 fps ~ eta: 24s
Rendering [========================================================>------------------------------------------------------------------] at 2.3 fps ~ eta: 24s
Rendering [=========================================================>-----------------------------------------------------------------] at 2.3 fps ~ eta: 23s
Rendering [==========================================================>----------------------------------------------------------------] at 2.3 fps ~ eta: 23s
Rendering [===========================================================>---------------------------------------------------------------] at 2.3 fps ~ eta: 22s
Rendering [=============================================================>-------------------------------------------------------------] at 2.3 fps ~ eta: 22s
Rendering [==============================================================>------------------------------------------------------------] at 2.3 fps ~ eta: 21s
Rendering [===============================================================>-----------------------------------------------------------] at 2.3 fps ~ eta: 21s
Rendering [================================================================>----------------------------------------------------------] at 2.3 fps ~ eta: 20s
Rendering [=================================================================>---------------------------------------------------------] at 2.3 fps ~ eta: 20s
Rendering [===================================================================>-------------------------------------------------------] at 2.3 fps ~ eta: 19s
Rendering [====================================================================>------------------------------------------------------] at 2.3 fps ~ eta: 19s
Rendering [=====================================================================>-----------------------------------------------------] at 2.3 fps ~ eta: 19s
Rendering [======================================================================>----------------------------------------------------] at 2.3 fps ~ eta: 18s
Rendering [========================================================================>--------------------------------------------------] at 2.3 fps ~ eta: 18s
Rendering [=========================================================================>-------------------------------------------------] at 2.3 fps ~ eta: 17s
Rendering [==========================================================================>------------------------------------------------] at 2.3 fps ~ eta: 17s
Rendering [===========================================================================>-----------------------------------------------] at 2.3 fps ~ eta: 16s
Rendering [============================================================================>----------------------------------------------] at 2.4 fps ~ eta: 16s
Rendering [==============================================================================>--------------------------------------------] at 2.4 fps ~ eta: 15s
Rendering [===============================================================================>-------------------------------------------] at 2.4 fps ~ eta: 15s
Rendering [================================================================================>------------------------------------------] at 2.4 fps ~ eta: 14s
Rendering [=================================================================================>-----------------------------------------] at 2.4 fps ~ eta: 14s
Rendering [===================================================================================>---------------------------------------] at 2.4 fps ~ eta: 13s
Rendering [====================================================================================>--------------------------------------] at 2.4 fps ~ eta: 13s
Rendering [=====================================================================================>-------------------------------------] at 2.4 fps ~ eta: 13s
Rendering [======================================================================================>------------------------------------] at 2.4 fps ~ eta: 12s
Rendering [========================================================================================>----------------------------------] at 2.4 fps ~ eta: 12s
Rendering [=========================================================================================>---------------------------------] at 2.4 fps ~ eta: 11s
Rendering [==========================================================================================>--------------------------------] at 2.4 fps ~ eta: 11s
Rendering [===========================================================================================>-------------------------------] at 2.4 fps ~ eta: 10s
Rendering [============================================================================================>------------------------------] at 2.4 fps ~ eta: 10s
Rendering [==============================================================================================>----------------------------] at 2.4 fps ~ eta: 10s
Rendering [===============================================================================================>---------------------------] at 2.4 fps ~ eta: 9s
Rendering [================================================================================================>--------------------------] at 2.4 fps ~ eta: 9s
Rendering [=================================================================================================>-------------------------] at 2.4 fps ~ eta: 8s
Rendering [===================================================================================================>-----------------------] at 2.4 fps ~ eta: 8s
Rendering [====================================================================================================>----------------------] at 2.4 fps ~ eta: 7s
Rendering [=====================================================================================================>---------------------] at 2.4 fps ~ eta: 7s
Rendering [======================================================================================================>--------------------] at 2.4 fps ~ eta: 7s
Rendering [========================================================================================================>------------------] at 2.4 fps ~ eta: 6s
Rendering [=========================================================================================================>-----------------] at 2.4 fps ~ eta: 6s
Rendering [==========================================================================================================>----------------] at 2.4 fps ~ eta: 5s
Rendering [===========================================================================================================>---------------] at 2.4 fps ~ eta: 5s
Rendering [============================================================================================================>--------------] at 2.4 fps ~ eta: 5s
Rendering [==============================================================================================================>------------] at 2.4 fps ~ eta: 4s
Rendering [===============================================================================================================>-----------] at 2.4 fps ~ eta: 4s
Rendering [================================================================================================================>----------] at 2.4 fps ~ eta: 3s
Rendering [=================================================================================================================>---------] at 2.4 fps ~ eta: 3s
Rendering [===================================================================================================================>-------] at 2.4 fps ~ eta: 3s
Rendering [====================================================================================================================>------] at 2.4 fps ~ eta: 2s
Rendering [=====================================================================================================================>-----] at 2.4 fps ~ eta: 2s
Rendering [======================================================================================================================>----] at 2.3 fps ~ eta: 1s
Rendering [========================================================================================================================>--] at 2.3 fps ~ eta: 1s
Rendering [=========================================================================================================================>-] at 2.3 fps ~ eta: 0s
Rendering [===========================================================================================================================] at 2.3 fps ~ eta: 0s
sh: ffmpeg: command not found
Warning in system2(ffmpeg, c("-pattern_type sequence", paste0("-r ", fps), :
error in running command
Error: Rendering with ffmpeg failed
p <- my_graph + transition_time(year) + labs(title = "Year: {frame_time}")
animate(p, width = 700, height = 450, renderer = ffmpeg_renderer(format = "webm"))
p <- my_graph + geom_text(aes(x = min(gdpPercap), y = min(lifeExp),
label = as.factor(year)), hjust = -2, vjust = -0.2, alpha = 0.2,
col = "gray", size = 20) + transition_states(as.factor(year),
state_length = 0)
animate(p, width = 700, height = 450, renderer = ffmpeg_renderer(format = "webm"))
library(tidyverse)
p_load(janitor)
moma <- read_csv("data_artworks.csv", col_types = cols(BeginDate = col_number(),
EndDate = col_number(), `Length (cm)` = col_number(), `Circumference (cm)` = col_number(),
`Duration (sec.)` = col_number(), `Diameter (cm)` = col_number())) %>%
clean_names()
Warning: One or more parsing issues, see `problems()` for details
problems(moma)
library(stringr)
moma <- moma %>%
mutate(gender = str_replace_all(gender, fixed("(female)",
ignore_case = TRUE), "F"), gender = str_replace_all(gender,
fixed("(male)", ignore_case = TRUE), "M"), num_artists = str_count(gender,
"[:alpha:]"), num_artists = na_if(num_artists, 0), n_female_artists = str_count(gender,
"F"), n_male_artists = str_count(gender, "M"), artist_gender = case_when(num_artists ==
1 & n_female_artists == 1 ~ "Female", num_artists ==
1 & n_male_artists == 1 ~ "Male"))
What different kinds of art classifications are available?
moma %>%
distinct(classification) %>%
print(n = Inf)
NA
filter on paintings
library(tidyr)
moma <- moma %>%
filter(classification == "Painting") %>%
drop_na(height_cm, width_cm) %>%
filter(height_cm > 0 & width_cm > 0)
select some columsn
moma <- moma %>%
select(title, contains("artist"), contains("year"), contains("_cm"),
classification, department)
write_csv(moma, "artworks-cleaned.csv")
p_load(here)
p_load(readr)
p_load(dplyr)
moma <- read_csv("artworks-cleaned.csv")
Rows: 2253 Columns: 16
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (6): title, artist, artist_bio, artist_gender, classification, department
dbl (6): num_artists, n_female_artists, n_male_artists, depth_cm, height_cm, width_cm
lgl (4): circumference_cm, diameter_cm, length_cm, seat_height_cm
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
some general info like the number
moma_dim <- moma %>%
filter(height_cm < 600, width_cm < 760) %>%
mutate(hw_ratio = height_cm/width_cm, hw_cat = case_when(hw_ratio >
1 ~ "taller than wide", hw_ratio < 1 ~ "wider than tall",
hw_ratio == 1 ~ "perfect square"))
library(ggthemes) # to load the fivethirtyeight theme
ggplot(moma_dim, aes(x = width_cm, y = height_cm, colour = hw_cat)) +
geom_point(alpha = 0.5) + ggtitle("MoMA Paintings, Tall and Wide") +
scale_colour_manual(name = "", values = c("gray50", "#FF9900",
"#B14CF0")) + theme_fivethirtyeight() + theme(axis.title = element_text()) +
labs(x = "Width", y = "Height")
ggplot(moma_dim, aes(x = width_cm, y = height_cm, colour = hw_cat)) +
geom_point(alpha = 0.5, show.legend = FALSE) + ggtitle("MoMA Paintings, Tall and Wide") +
scale_colour_manual(name = "", values = c("gray50", "#ee5863",
"#6999cd")) + theme_fivethirtyeight() + theme(axis.title = element_text()) +
labs(x = "Width", y = "Height") + annotate(x = 200, y = 380,
geom = "text", label = "Taller than\nWide", color = "#ee5863",
size = 5, hjust = 1, fontface = 2) + annotate(x = 375, y = 100,
geom = "text", label = "Wider than\nTall", color = "#6999cd",
size = 5, hjust = 0, fontface = 2)